What this does: Loads OECD-CRS (TXT, one file per year) and GODAD (project, ADM1, ADM2), harmonizes fields, restricts to 2000–2020, and produces diagnostics & visuals: totals over time, coverage heatmap, sector share deltas, identity checks across levels (project/ADM1/ADM2), concentration (Gini), unspecified-region shares, splitting behavior, and an ADM1 map for a case country. Parameters above let you tweak quickly.
Expected files (place them into
params$godad_dir):
GODAD_projectlevel.csvGODAD_adm1.csvGODAD_adm2.csv(optional but recommended)CRS files should be named like:
CRS <year> Data.txtinsideparams$crs_dir.
# R pkgs
need <- c("reticulate","dplyr","stringr","forcats","ggplot2","scales","tibble","purrr","uwot","stopwords")
to_install <- setdiff(need, rownames(installed.packages()))
if(length(to_install)) install.packages(to_install, quiet=TRUE)
invisible(lapply(need, library, character.only = TRUE))
# R pkgs
n
## function ()
## {
## peek_mask()$get_current_group_size()
## }
## <bytecode: 0x10bf9a030>
## <environment: namespace:dplyr>
# Helpers
fmt_dollar <- scales::label_dollar(scale_cut = scales::cut_short_scale())
theme_beau <- function(base_size=12){
ggplot2::theme_minimal(base_size = base_size) +
theme(plot.title=element_text(face="bold"),
plot.subtitle=element_text(color="#555"),
plot.caption=element_text(color="#777", size=9),
panel.grid.minor=element_blank(),
legend.position="top")
}
options(stringsAsFactors = FALSE)
# Faster strings
options(stringsAsFactors = FALSE)
fmt_dollar <- scales::label_dollar(scale_cut = scales::cut_short_scale()) # $1k, $1M, ...
fmt_si <- scales::label_number(scale_cut = scales::cut_si("")) # 1k, 1M, 1G
fmt_pct <- scales::label_percent(accuracy = 0.1)
# Access parameters explicitly
data_file <- params$data_file
amount_kind <- params$amount
use_crs <- params$map_crs
stopifnot(file.exists(data_file))
# Select only columns we'll actually use to save memory
cols_needed <- c(
"project_id","project_location_id","donor",
"gid_0","gid_1","gid_2","name_0","name_1","name_2",
"latitude","longitude","startyear","closingyear","paymentyear",
"title","description","receiving_agencies","implementing_agencies","funding_agencies",
"financial_type","flow_class","status","precision_code","location_count",
"sector_codes","sector_categories","sector_name",
"comm","comm_nominal","comm_ibrd","comm_ida",
"disb","disb_nominal","disb_ibrd","disb_ida",
"comm_loc_evensplit","disb_loc_evensplit","purpose_code","place_name"
)
proj_raw <- vroom::vroom(
data_file,
col_select = any_of(cols_needed),
altrep = TRUE,
progress = TRUE,
num_threads = max(1, parallel::detectCores() - 1),
show_col_types = FALSE
) %>% janitor::clean_names()
## indexing GODAD_projectlevel.csv [-------------------------] 302.36GB/s, eta: 0sindexing GODAD_projectlevel.csv [=================----------] 1.91GB/s, eta: 0sindexing GODAD_projectlevel.csv [==================---------] 1.91GB/s, eta: 0sindexing GODAD_projectlevel.csv [===================--------] 1.90GB/s, eta: 0sindexing GODAD_projectlevel.csv [====================-------] 1.90GB/s, eta: 0sindexing GODAD_projectlevel.csv [====================-------] 1.87GB/s, eta: 0sindexing GODAD_projectlevel.csv [=====================------] 1.88GB/s, eta: 0sindexing GODAD_projectlevel.csv [=======================----] 1.86GB/s, eta: 0sindexing GODAD_projectlevel.csv [=======================----] 1.83GB/s, eta: 0sindexing GODAD_projectlevel.csv [========================---] 1.81GB/s, eta: 0sindexing GODAD_projectlevel.csv [=========================--] 1.78GB/s, eta: 0sindexing GODAD_projectlevel.csv [==========================-] 1.73GB/s, eta: 0sindexing GODAD_projectlevel.csv [===========================] 1.67GB/s, eta: 0sindexing GODAD_projectlevel.csv [===========================] 1.68GB/s, eta: 0s
# Basic cleaning & derived fields
proj <- proj_raw %>%
mutate(
iso3_rec = toupper(gid_0), # GADM country code = ISO3
year_commit = suppressWarnings(as.integer(startyear)), # commitment-ish year
year_disb = suppressWarnings(as.integer(paymentyear)),
amount_commit = suppressWarnings(as.numeric(comm)),
amount_disb = suppressWarnings(as.numeric(disb)),
has_coords = !is.na(latitude) & !is.na(longitude),
desc_len = nchar(description %||% "")
)
# Which year dimension to prefer in global charts
proj <- proj %>%
mutate(year = if (amount_kind == "disb") year_disb else year_commit)
# Quick snapshot
list(
n_rows = nrow(proj),
time_window_commit = range(proj$year_commit, na.rm=TRUE),
time_window_disb = range(proj$year_disb, na.rm=TRUE),
pct_with_desc = mean(!is.na(proj$description) & nzchar(proj$description), na.rm=TRUE)
)
## $n_rows
## [1] 903253
##
## $time_window_commit
## [1] 1900 2023
##
## $time_window_disb
## [1] 1989 2024
##
## $pct_with_desc
## [1] 0.9505078
# --- Project counts over time ---
ts_counts <- bind_rows(
proj %>% count(year_commit, name="n") %>% mutate(type="Projects (commit-year)", year=year_commit),
proj %>% count(year_disb, name="n") %>% mutate(type="Projects (disb-year)", year=year_disb)
) %>% filter(!is.na(year)) %>% arrange(year)
yr_min <- min(ts_counts$year, na.rm = TRUE)
yr_max <- max(ts_counts$year, na.rm = TRUE)
n_obs <- scales::comma(nrow(proj))
sub_txt <- paste0("Project–location counts by year • ", yr_min, "–", yr_max)
cap_txt <- "Lines show counts of project–location records by year, using either commitment or disbursement year. \
Counts are raw and not adjusted for project size or funding."
p_ts_n <- ggplot(ts_counts, aes(year, n, color = type)) +
geom_line(linewidth = 0.9) +
scale_color_manual(values = c("Projects (commit-year)" = "#3182bd", "Projects (disb-year)" = "#e6550d")) +
scale_y_continuous(labels = fmt_si, expand = expansion(mult = c(0.02, 0.08))) +
scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
labs(
title = "Project counts over time",
subtitle = sub_txt,
x = NULL, y = "Number of project–locations",
color = NULL,
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(
legend.position = "top",
panel.grid.minor = element_blank(),
plot.margin = margin(5, 10, 35, 5)
)
p_ts_n
# --- Financial volumes over time ---
ts_amounts <- bind_rows(
proj %>% group_by(year_commit) %>% summarise(total = sum(amount_commit, na.rm=TRUE), .groups="drop") %>%
mutate(type="Total commitments (USD)", year=year_commit),
proj %>% group_by(year_disb) %>% summarise(total = sum(amount_disb, na.rm=TRUE), .groups="drop") %>%
mutate(type="Total disbursements (USD)", year=year_disb)
) %>% filter(!is.na(year))
yr_min2 <- min(ts_amounts$year, na.rm = TRUE)
yr_max2 <- max(ts_amounts$year, na.rm = TRUE)
sub_txt2 <- paste0("Annual totals (USD) • ", yr_min2, "–", yr_max2)
cap_txt2 <- "Lines show the sum of all commitments and disbursements by year (project–locations aggregated). \
Totals are not deflated or normalized."
p_ts_amt <- ggplot(ts_amounts, aes(year, total, color = type)) +
geom_line(linewidth = 0.9) +
scale_color_manual(values = c("Total commitments (USD)" = "#3182bd", "Total disbursements (USD)" = "#e6550d")) +
scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0.02, 0.08))) +
scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
labs(
title = "Financial volumes over time",
subtitle = sub_txt2,
x = NULL, y = "USD",
color = NULL,
caption = stringr::str_wrap(cap_txt2, width = 95)
) +
theme_beau() +
theme(
legend.position = "top",
panel.grid.minor = element_blank(),
plot.margin = margin(5, 10, 35, 5)
)
p_ts_amt
proj_yeared <- proj %>%
mutate(year = if (amount_kind == "disb") year_disb else year_commit) %>%
filter(!is.na(year), !is.na(donor), nzchar(donor))
# Top donors by count (or keep your original selection)
top_donors <- proj_yeared %>%
count(donor, name = "n") %>%
slice_max(n, n = 20) %>%
pull(donor)
yr_min <- min(proj_yeared$year, na.rm = TRUE)
yr_max <- max(proj_yeared$year, na.rm = TRUE)
# Absolute counts (with full year grid)
donor_heat <- proj_yeared %>%
filter(donor %in% top_donors) %>%
count(donor, year, name = "n") %>%
group_by(donor) %>%
tidyr::complete(year = seq(yr_min, yr_max, 1), fill = list(n = 0)) %>%
mutate(tot = sum(n)) %>%
ungroup() %>%
mutate(donor = forcats::fct_reorder(donor, tot))
# Normalized (per-donor share)
donor_heat_norm <- donor_heat %>%
group_by(donor) %>%
mutate(share = n / pmax(tot, 1)) %>%
ungroup()
# -------- Plot ABSOLUTE COUNTS --------
ggplot(donor_heat, aes(year, donor, fill = n)) +
geom_tile(color = "white", linewidth = 0.15) +
scale_fill_viridis_c(labels = fmt_si, name = "Projects", trans = "sqrt") +
scale_x_continuous(expand = expansion(mult = c(0.002, 0.002))) +
labs(title = "Activity heatmap — top donors by project count",
subtitle = paste0("Top 20 donors • ", yr_min, "–", yr_max),
x = NULL, y = NULL,
caption = stringr::str_wrap("Tiles show number of project–locations per donor & year; missing years filled with 0.", 95)) +
theme_beau() +
theme(panel.grid = element_blank(), legend.position = "right",
plot.margin = margin(5, 10, 35, 5))
# -------- Plot NORMALIZED SHARES --------
ggplot(donor_heat_norm, aes(year, donor, fill = share)) +
geom_tile(color = "white", linewidth = 0.15) +
scale_fill_viridis_c(labels = scales::label_percent(accuracy = 1), name = "Share of donor’s projects") +
scale_x_continuous(expand = expansion(mult = c(0.002, 0.002))) +
labs(title = "Activity heatmap — per-donor share by year",
subtitle = paste0("Each row scaled to that donor’s total • ", yr_min, "–", yr_max),
x = NULL, y = NULL,
caption = stringr::str_wrap("Highlights relative hot years within each donor; not comparable across donors.", 95)) +
theme_beau() +
theme(panel.grid = element_blank(), legend.position = "right",
plot.margin = margin(5, 10, 35, 5))
desc_stats <- tibble(
`Non-missing descriptions` = mean(!is.na(proj$description) & nzchar(proj$description)),
`Median description length` = median(proj$desc_len[proj$desc_len>0], na.rm=TRUE),
`P90 description length` = quantile(proj$desc_len[proj$desc_len>0], 0.9, na.rm=TRUE)
)
DT::datatable(round(desc_stats, 3), caption = "Description coverage & length")
top10_donors <- proj %>%
filter(!is.na(donor) & nzchar(donor)) %>%
count(donor, wt = amount_commit, sort = TRUE) %>%
slice_head(n = 10) %>%
pull(donor)
dlen <- proj %>%
filter(donor %in% top10_donors, desc_len > 0) %>%
mutate(desc_len = pmin(desc_len, 1200L)) # clamp long tails for readability
# stats for subtitle/caption
sub_txt <- paste0(
"Top 10 donors by committed amount • ",
scales::comma(nrow(dlen)), " descriptions • length truncated at 1,200 chars"
)
cap_txt <- "Each panel shows the density of description lengths for a donor. \
Vertical line marks the median length. Extremely long descriptions are truncated at 1,200 characters for readability."
# per-donor medians
meds <- dlen %>%
group_by(donor) %>%
summarise(med = median(desc_len, na.rm = TRUE), .groups = "drop")
p_desc <- ggplot(dlen, aes(desc_len, after_stat(density), color = donor)) +
geom_freqpoly(bins = 60, alpha = 0.85, linewidth = 0.7, show.legend = FALSE) +
geom_vline(data = meds, aes(xintercept = med, color = donor), linetype = 2, linewidth = 0.6, show.legend = FALSE) +
coord_cartesian(xlim = c(0, 1200)) +
facet_wrap(~ donor, ncol = 2, scales = "free_y") +
labs(
title = "Project description length distribution",
subtitle = sub_txt,
x = "Characters", y = "Density",
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold"),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
p_desc
if (!requireNamespace("ggridges", quietly = TRUE)) install.packages("ggridges", quiet = TRUE)
library(ggridges)
p_desc_ridge <- ggplot(dlen, aes(x = desc_len, y = reorder(donor, desc_len, median), fill = donor)) +
ggridges::geom_density_ridges(scale = 1.1, rel_min_height = 0.01, alpha = 0.85, color = "white", linewidth = 0.2, show.legend = FALSE) +
geom_vline(data = meds, aes(xintercept = med), linetype = 2, linewidth = 0.6, color = "grey30") +
coord_cartesian(xlim = c(0, 1200)) +
labs(
title = "Project description length distribution (ridgelines)",
subtitle = sub_txt,
x = "Characters", y = NULL,
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(panel.grid.minor = element_blank())
p_desc_ridge
## Picking joint bandwidth of 20
# ---- TF–IDF by donor (clean & robust) ----
if (!"tidytext" %in% rownames(installed.packages())) install.packages("tidytext", quiet = TRUE)
if (!"stopwords" %in% rownames(installed.packages())) install.packages("stopwords", quiet = TRUE)
library(tidytext); library(stopwords)
# 1) multilingual stopwords + domain terms
stop_tbl <- tibble::tibble(
token = unique(c(
stopwords("en"), stopwords("fr"), stopwords("es"), stopwords("de"),
stopwords("it"), stopwords("pt"), stopwords("nl"), stopwords("no"),
tolower(c("project","programme","program","aid","loan","grant","credit",
"world","bank","government","support","development","ministry",
"department","agency","project’s","projects","projet", "financement", "china", "chinese", "indian", "usaid"))
))
)
# 2) pick top donors by amount (commit/disburse consistent with params$amount)
metric_vec <- if (amount_kind == "disb") proj$amount_disb else proj$amount_commit
top10_donors <- proj %>%
mutate(metric = metric_vec) %>%
filter(!is.na(donor), nzchar(donor)) %>%
group_by(donor) %>% summarise(total = sum(metric, na.rm = TRUE), .groups="drop") %>%
slice_max(total, n = 10) %>% pull(donor)
# 3) build corpus and tokenize
corpus <- proj %>%
filter(donor %in% top10_donors, !is.na(description), nzchar(description)) %>%
transmute(doc = donor, text = stringr::str_to_lower(description))
tokens <- corpus %>%
tidytext::unnest_tokens(token, text, token = "words", strip_numeric = TRUE) %>%
filter(stringr::str_detect(token, "^[a-z]{3,}$")) %>% # ≥3 letters only
anti_join(stop_tbl, by = "token")
# 4) TF–IDF (donor = document)
tfidf <- tokens %>%
count(doc, token, sort = TRUE) %>%
bind_tf_idf(term = token, document = doc, n = n)
# 5) Filter extremes: drop tokens in only 1 donor or in all donors
docfreq <- tfidf %>%
group_by(token) %>% summarise(df = n_distinct(doc), .groups="drop")
n_docs <- n_distinct(tfidf$doc)
tfidf_clean <- tfidf %>%
inner_join(docfreq, by = "token") %>%
filter(df >= 2, df <= n_docs - 1)
# Stats for subtitle/caption
n_docs <- dplyr::n_distinct(tfidf_clean$doc)
n_tokens <- dplyr::n_distinct(tfidf_clean$term %||% tfidf_clean$token)
sub_txt <- paste0("Top TF–IDF tokens by donor • ", n_docs, " donors • ",
scales::comma(n_tokens), " unique tokens after filtering")
cap_txt <- "TF–IDF computed with tidytext::bind_tf_idf(). Tokens are 3+ letters, lowercased; \
multilingual + domain stopwords removed; tokens appearing in only one donor or in all donors were dropped. \
Bars are ordered within facets."
# Ensure we have 'token' column name
if (!"token" %in% names(tfidf_clean)) tfidf_clean <- dplyr::rename(tfidf_clean, token = term)
top_tfidf <- tfidf_clean %>%
dplyr::group_by(doc) %>%
dplyr::slice_max(order_by = tf_idf, n = 10, with_ties = FALSE) %>%
dplyr::ungroup() %>%
dplyr::mutate(token = tidytext::reorder_within(token, tf_idf, doc))
ggplot(top_tfidf, aes(tf_idf, token)) +
geom_col(fill = "#3182bd", alpha = 0.9, width = 0.85) +
tidytext::scale_y_reordered() +
facet_wrap(~ doc, scales = "free_y", ncol = 2) +
scale_x_continuous(labels = scales::label_number(accuracy = 0.001)) +
labs(
title = "Top description tokens by TF–IDF (per donor)",
subtitle = sub_txt,
x = "TF–IDF", y = NULL,
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold"),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
# Long format for quick compare
amt_long <- bind_rows(
proj %>% select(project_id, donor, iso3_rec, year = year_commit, amount = amount_commit) %>% mutate(kind="Commit"),
proj %>% select(project_id, donor, iso3_rec, year = year_disb, amount = amount_disb) %>% mutate(kind="Disburse")
) %>% filter(!is.na(amount), amount > 0)
# Histogram of amounts (log scale)
sub_hist <- paste0(
scales::comma(nrow(amt_long)), " project–amount observations • ",
min(amt_long$year, na.rm=TRUE), "–", max(amt_long$year, na.rm=TRUE)
)
cap_hist <- "Histogram of project-level financial amounts (commitments/disbursements combined). \
X-axis is log-scaled (USD); each bin counts number of project–amount observations. \
Values ≤ 0 are excluded."
p_hist <- ggplot(amt_long, aes(amount)) +
geom_histogram(bins = 60, fill = "#3182bd", color = "white", linewidth = 0.2, alpha = 0.9) +
scale_x_log10(labels = fmt_dollar) +
labs(
title = "Amount distribution (log scale)",
subtitle = sub_hist,
x = "USD (log10)", y = "Number of projects",
caption = stringr::str_wrap(cap_hist, width = 95)
) +
theme_beau() +
theme(
panel.grid.minor = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
p_hist
# Violin by donor
top_donors <- amt_long %>%
count(donor, sort = TRUE) %>%
slice_head(n = 12) %>%
pull(donor)
sub_violin <- paste0(
"Top ", length(top_donors), " donor groups by number of projects • ",
scales::comma(nrow(filter(amt_long, donor %in% top_donors))), " observations"
)
cap_violin <- "Each violin shows the distribution of project amounts for a donor, split by commitment vs disbursement. \
Widths indicate relative density; scale is log(USD). \
Only top donors by project count are shown."
p_violin <- amt_long %>%
filter(donor %in% top_donors) %>%
ggplot(aes(donor, amount, fill = kind)) +
geom_violin(scale = "width", trim = TRUE, alpha = 0.8, color = "white", linewidth = 0.2) +
scale_y_log10(labels = fmt_dollar) +
coord_flip() +
scale_fill_manual(values = c("Commit" = "#3182bd", "Disburse" = "#e6550d")) +
labs(
title = "Amount distribution by donor (top groups)",
subtitle = sub_violin,
x = NULL, y = "USD (log10)", fill = NULL,
caption = stringr::str_wrap(cap_violin, width = 95)
) +
theme_beau() +
theme(
legend.position = "top",
panel.grid.minor = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
p_violin
comm_ts <- proj %>%
transmute(year = suppressWarnings(as.integer(year_commit)),
total = suppressWarnings(as.numeric(amount_commit))) %>%
filter(!is.na(year)) %>%
group_by(year) %>%
summarise(total = sum(total, na.rm = TRUE), .groups = "drop") %>%
mutate(kind = "Commit")
disb_ts <- proj %>%
transmute(year = suppressWarnings(as.integer(year_disb)),
total = suppressWarnings(as.numeric(amount_disb))) %>%
filter(!is.na(year)) %>%
group_by(year) %>%
summarise(total = sum(total, na.rm = TRUE), .groups = "drop") %>%
mutate(kind = "Disburse")
ts_amt_long <- bind_rows(comm_ts, disb_ts) %>%
arrange(kind, year)
# Guard: if nothing to show, bail early with a message
if (nrow(ts_amt_long) == 0) {
stop("No annual totals available (both year_commit and year_disb are NA).")
}
# Nice subtitle/caption
yr_min <- min(ts_amt_long$year, na.rm = TRUE)
yr_max <- max(ts_amt_long$year, na.rm = TRUE)
sub_txt <- paste0("Annual totals (USD) • ", yr_min, "–", yr_max)
cap_txt <- "Lines show total commitments/disbursements per year (sum across all project–locations). Hover for values."
# Pre-format hover text
ts_amt_long <- ts_amt_long %>%
mutate(hover = paste0(
"<b>", kind, "</b><br>",
"Year: ", year, "<br>",
"Total: ", scales::label_dollar(scale_cut = scales::cut_short_scale())(total)
))
p <- ggplot(ts_amt_long, aes(x = year, y = total, color = kind, text = hover, group = kind)) +
geom_line(linewidth = 1, na.rm = TRUE) +
scale_color_manual(values = c("Commit" = "#3366CC", "Disburse" = "#DC3912")) +
scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0.02, 0.08))) +
scale_x_continuous(expand = expansion(mult = c(0.01, 0.01))) +
labs(
title = "Financial volumes over time",
subtitle = sub_txt,
x = NULL, y = "USD", color = NULL,
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(
legend.position = "top",
panel.grid.minor = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 40, l = 5)
)
plotly::ggplotly(p, tooltip = "text") %>%
plotly::layout(legend = list(orientation = "h", x = 0, y = 1.1))
metric <- if (amount_kind == "disb") "disburse" else "commit"
by_cty <- proj %>%
group_by(iso3_rec) %>%
summarise( n_proj = n(),
commit = sum(amount_commit, na.rm=TRUE),
disburse = sum(amount_disb, na.rm=TRUE), .groups="drop" )
# world map
world <- tryCatch( rnaturalearth::ne_countries(scale = "medium", returnclass = "sf"), error = function(e) NULL )
# subtitle & caption
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
covered <- sum(!is.na(by_cty[[metric]]) & by_cty[[metric]] > 0)
pct_cov <- covered / n_distinct(by_cty$iso3_rec)
sub_txt <- paste0(
if (amount_kind == "disb") "Disbursements" else "Commitments",
" in USD • ", yr_min, "–", yr_max, " • ",
scales::percent(pct_cov, accuracy = 1), " of recipients with > 0"
)
cap_txt <- "Fill shows total amounts aggregated by recipient ISO3 across all years. Grey = no data or zero total. \
Values are on a log scale to improve contrast across large ranges. Basemap: Natural Earth."
if (!is.null(world)) {
world <- world %>%
dplyr::mutate(iso3_rec = dplyr::if_else(iso_a3 == "SDS", "SSD", iso_a3))
map_df <- dplyr::left_join(world, by_cty, by = "iso3_rec")
# set zeros to NA for better visual separation (keep outline)
map_df$fill_val <- map_df[[metric]]
map_df$fill_val[!is.na(map_df$fill_val) & map_df$fill_val <= 0] <- NA_real_
p_map <- ggplot(map_df) +
geom_sf(aes(fill = fill_val), color = "white", linewidth = 0.15) +
scale_fill_viridis_c(
name = "USD (total)",
labels = fmt_dollar,
trans = "log10",
na.value = "grey92",
guide = guide_colorbar(
barheight = unit(10, "lines"),
ticks = TRUE
)
) +
labs(
title = paste("Recipient country totals —", ifelse(amount_kind=="disb","Disbursements","Commitments")),
subtitle = sub_txt,
x = NULL, y = NULL,
caption = stringr::str_wrap(cap_txt, width = 95)
) +
theme_beau() +
theme(
legend.position = "right",
panel.grid = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
if (!is.na(use_crs) && nzchar(use_crs)) {
p_map <- tryCatch(p_map + coord_sf(crs = use_crs),
error = function(e) { message("Projection failed, using WGS84"); p_map })
}
p_map
} else {
message("World basemap unavailable (no rnaturalearth). Skipping choropleth.")
}
# build points as sf (WGS84)
pts <- proj %>%
filter(has_coords, between(longitude, -180, 180), between(latitude, -90, 90))
pts_sf <- sf::st_as_sf(pts, coords = c("longitude","latitude"), crs = 4326, remove = FALSE)
# world basemap
base_world <- tryCatch(rnaturalearth::ne_countries(scale = "medium", returnclass = "sf"), error = function(e) NULL)
if (!is.null(base_world)) base_world <- subset(base_world, admin != "Antarctica")
# Diagnostics for subtitle
n_pts <- nrow(pts_sf)
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
sub_txt <- paste0("All projects with coordinates • ", scales::comma(n_pts), " points • ", yr_min, "–", yr_max)
cap_pts <- "Each dot corresponds to a project–location row with usable latitude/longitude. \
Dots are semi-transparent to reduce overplotting, so darker regions indicate denser clusters. \
Basemap is simplified Natural Earth (no Antarctica)."
p_pts <- ggplot() +
geom_sf(data = base_world, fill = "grey98", color = "grey85", linewidth = 0.2) +
geom_sf(data = pts_sf, alpha = 0.25, size = 0.2, color = "steelblue") +
labs(
title = "Project point locations",
subtitle = sub_txt,
x = NULL, y = NULL,
caption = stringr::str_wrap(cap_pts, width = 95)
) +
theme_beau() +
theme(
legend.position = "none",
panel.grid = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
# Optional projection
if (!is.na(use_crs) && nzchar(use_crs)) {
p_pts <- tryCatch(p_pts + coord_sf(crs = use_crs),
error = function(e) { message("Projection failed, staying in WGS84"); p_pts })
}
p_pts
if (!requireNamespace("maps", quietly = TRUE)) install.packages("maps", quiet = TRUE)
world_df <- ggplot2::map_data("world")
# Helpers for subtitle/caption
n_pts <- nrow(pts)
yr_min <- suppressWarnings(min(proj$year_commit, proj$year_disb, na.rm = TRUE))
yr_max <- suppressWarnings(max(proj$year_commit, proj$year_disb, na.rm = TRUE))
sub_txt <- paste0("All projects with coordinates • ", n_pts, " points • ", yr_min, "–", yr_max)
cap_count <- "Counts are 2D hexbin tallies of project points in geographic (lon/lat) space. \
Higher counts indicate denser clusters of geocoded project locations. \
Basemap is simplified and drawn in WGS84 using quick mapping."
p_hex <- ggplot() +
geom_polygon(
data = world_df,
aes(long, lat, group = group),
fill = "grey98", color = "grey85", linewidth = 0.2
) +
stat_bin_2d(
data = pts,
aes(x = longitude, y = latitude),
bins = 130, # tune as needed (80–160 works well)
na.rm = TRUE
) +
scale_fill_viridis_c(
labels = fmt_si,
name = "Projects",
trans = "sqrt", # improves contrast between low/high counts
guide = guide_colorbar(barheight = unit(10, "lines"))
) +
coord_quickmap() +
labs(
title = "Spatial density of project points (hex bins)",
subtitle = sub_txt,
x = NULL, y = NULL,
caption = stringr::str_wrap(cap_count, width = 95)
) +
theme_beau() +
theme(
legend.position = "right",
panel.grid = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
p_hex
metric <- if (amount_kind == "disb") "amount_disb" else "amount_commit"
sub_amt <- paste0(
if (amount_kind == "disb") "Disbursements" else "Commitments",
" aggregated per hex • ", yr_min, "–", yr_max
)
cap_amt <- "Fill shows the sum of amounts (USD) within each hexagon; zeros are blank. \
Use with care: large urban clusters can dominate totals. \
Basemap is simplified and drawn in WGS84 using quick mapping."
p_hex_amt <- ggplot() +
geom_polygon(
data = world_df,
aes(long, lat, group = group),
fill = "grey98", color = "grey85", linewidth = 0.2
) +
stat_summary_2d(
data = pts,
aes(x = longitude, y = latitude, z = .data[[metric]]),
bins = 130,
fun = function(z) sum(z, na.rm = TRUE),
na.rm = TRUE
) +
scale_fill_viridis_c(
labels = fmt_dollar,
name = "USD (sum)",
trans = "sqrt", # friendlier than log(10) for zeros/low cells
guide = guide_colorbar(barheight = unit(10, "lines"))
) +
coord_quickmap() +
labs(
title = "Amount-weighted project density",
subtitle = sub_amt,
x = NULL, y = NULL,
caption = stringr::str_wrap(cap_amt, width = 95)
) +
theme_beau() +
theme(
legend.position = "right",
panel.grid = element_blank(),
plot.margin = margin(t = 5, r = 10, b = 35, l = 5)
)
p_hex_amt
library(countrycode)
metric <- if (amount_kind == "disb") "amount_disb" else "amount_commit"
top_cty <- proj %>%
group_by(iso3_rec) %>%
summarise(val = sum(.data[[metric]], na.rm = TRUE), .groups = "drop") %>%
mutate(country = countrycode::countrycode(iso3_rec, "iso3c", "country.name"),
country = ifelse(is.na(country), iso3_rec, country)) %>%
arrange(desc(val)) %>%
slice_head(n = 25) %>%
mutate(
share = val / sum(val),
country = forcats::fct_reorder(country, val)
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `country = countrycode::countrycode(iso3_rec, "iso3c",
## "country.name")`.
## Caused by warning:
## ! Some values were not matched unambiguously: UNSPECIFIED, XKO, XKX
caption_text <- paste0(
"Amounts are ", if (amount_kind == "disb") "disbursements" else "commitments in USD (as provided in the dataset), summed across all years and project–location rows. Only the Top-25 recipient countries are shown; the ‘share’ is within this Top-25 subset. Country names derived from ISO3 via {countrycode}."
)
ggplot(top_cty, aes(x = country, y = val)) +
geom_segment(aes(xend = country, y = 0, yend = val), linewidth = 1.1, color = "grey75") +
geom_point(aes(fill = val), shape = 21, size = 4, stroke = 0.2) +
geom_text(
aes(label = paste0(scales::label_dollar(scale_cut = scales::cut_short_scale())(val),
" • ", scales::percent(share, accuracy = 0.1))),
hjust = -0.05, size = 3.2, color = "#333"
) +
coord_flip(clip = "off") +
scale_fill_viridis_c(guide = "none") +
scale_y_continuous(labels = fmt_dollar, expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Top 25 recipient countries by total amount",
subtitle = if (amount_kind == "disb") "Disbursements" else "Commitments",
x = NULL, y = "USD",
caption = stringr::str_wrap(caption_text, width = 90) # wrap caption
) +
theme_beau() +
theme(
panel.grid.major.y = element_blank(),
plot.margin = margin(t = 5, r = 30, b = 40, l = 5) # more space at bottom
)
hyg <- tibble(
pct_missing_coords = mean(!proj$has_coords),
pct_missing_commit = mean(is.na(proj$amount_commit)),
pct_missing_disb = mean(is.na(proj$amount_disb)),
pct_missing_year_c = mean(is.na(proj$year_commit)),
pct_missing_year_d = mean(is.na(proj$year_disb))
)
DT::datatable(round(hyg, 3), caption = "Missingness summary (project-level)")
startyear as “commit
year” and paymentyear as “disb year”. If your dataset
encodes different timing conventions, switch the assignments at the
top.comm /
disb (non-nominal). Swap to *_nominal if you
want current USD.gid_0 is treated as
ISO3 for recipient. For choropleths I join to Natural
Earth by iso_a3; a couple of codes (e.g.,
SSD) may need tiny fixes depending on your build.arrow::open_dataset) or
save a feather/parquet for subsequent runs.